In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/data-model1/data_model1.csv
/kaggle/input/model2/Model 2.txt
/kaggle/input/model3/Model 3.txt
/kaggle/input/df-atp/df_atp.csv
/kaggle/input/data-model2/data_model2.csv
/kaggle/input/data-model4/data_model4.csv
/kaggle/input/model1/Model 1.txt
/kaggle/input/data-model3/data_model3.csv
/kaggle/input/data-4qcsv/data_4q.csv
/kaggle/input/model4/Model 4.txt

Yes!! Here we are again!! This is the third part of our ATP journey!! =) This time is for explaining why our models made the predictions from 4 models that we built in our previous notebook. The models were all for a binary classification using lightGBM, a tree-based learning algotighm. The models were:

  1. Baseline model
  2. Categorical Features (LabelEncoding)
  3. Feature selection (RandomForestClassifier)
  4. Good model adding noise

Lets remember that the main difference from the 1st and 4th model is that we swapped all the columns correctly: in the first attempt (1st model) we only swapped the columns of the players but we didn't swap their respective columns (e.g. the column "WRank" - the rank position of the winner of the match - was not swapped with the "LRank" - the rank position of the loser of the match... even if the player's order in the sample was swapped!). You can imagine that we created a difficult data to learn!! Interestingly to observe that, swapping all the columns can reach a validation accuracy of 1!! This is why we opted to add a noise parameter to our 4th model, so trying to avoid overfitting (altought you'll observe that we didn't add to much noise in this example!)

In [2]:
def get_data_splits(dataframe, valid_fraction=0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe) * valid_fraction)

    train = dataframe[:-valid_size * 2]
    valid = dataframe[-valid_size * 2:-valid_size]
    test = dataframe[-valid_size:]
    #print(f"Train size : {len(train)}\nValidation size : {len(valid)}\nTest size : {len(test)}")
    return train, valid, test
In [3]:
import lightgbm as lgb
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt 
import joblib
dire1 = "../input/model1/Model 1.txt"
bst1 = joblib.load(dire1)

dire2 = "../input/model2/Model 2.txt"
bst2 = joblib.load(dire2)

dire3 = "../input/model3/Model 3.txt"
bst3 = joblib.load(dire3)

dire4 = "../input/model4/Model 4.txt"
bst4 = joblib.load(dire4)
In [4]:
# df_atp = pd.read_csv("../input/df-atp/df_atp.csv",index_col=0)
# df = pd.read_csv("../input/data-4qcsv/data_4q.csv",index_col=0)
data_model1 = pd.read_csv("../input/data-model1/data_model1.csv",index_col=0)
data_model2 = pd.read_csv("../input/data-model2/data_model2.csv",index_col=0)
data_model3 = pd.read_csv("../input/data-model3/data_model3.csv",index_col=0)
data_model4 = pd.read_csv("../input/data-model4/data_model4.csv",index_col=0)

How could one explain a machine learning model?

Lets start with SHAP! SHAP values are a very interesting way to explain any Machine Learning model! More than visually friendly for anyone, (welcoming also those not-expert in ML) it's a way to explain WHY the model made certain choice instead of other. You can ask yourself why this should be important. Well, in a world that is processing tons of data, the concern about why an automated veihule chose to turn right instead of left in a crowded crosroad may be of extreme importance not only to prevent eventual problems but also to build trust (quite important if we consider that we are delegating more and more human-tasks to machines). In a few words, SHAP values will build a tree of dependence between all the variables of the model and tell you how much each variable contributted to make such prediction.

Looking at the examples below will aid us better understand!

To help us get all the necessary parameters and models, I'll create a bag of models:

In [5]:
import numpy as np
import shap
valid_size = int(len(data_model1) * 0.1)
        
train1, valid1, test1 = get_data_splits(data_model1)

train2, valid2, test2 = get_data_splits(data_model2)
train3, valid3, test3 = get_data_splits(data_model3)
train4, valid4, test4 = get_data_splits(data_model4)

bag = {1:{"train":train1,"valid":valid1,"test":test1,"bst":bst1},\
       2:{"train":train2,"valid":valid2,"test":test2,"bst":bst2},\
       3:{"train":train3,"valid":valid3,"test":test3,"bst":bst3},\
       4:{"train":train4,"valid":valid4,"test":test4,"bst":bst4}}

Single Predictions

Here we're gonna look at a single prediction of all the models. The sample to predict is assigned with the number of the row ("row_to_show") of our dataset!

In [6]:
    
shap.initjs()
for i in range(1,5):
    #### SINGLE PREDICTION
    feature_cols = bag[i]["valid"].columns.drop('Labels')
    
    row_to_show = 0      ############# I'm looking here at the very first sample of our validation set! Change it to play with other examples! 
    
    data_for_prediction = bag[i]["valid"].drop("Labels",axis=1).iloc[row_to_show]
    data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
    idx = bag[i]["valid"].index[0]
    print(f"Model number : {i}\nprediction of model : ",bag[i]["bst"].predict(data_for_prediction_array), " ground truth : ", bag[i]["valid"]["Labels"][idx + row_to_show])

    data_ser = data_for_prediction.values.reshape(1, -1)
    
    ###### We call the explainer!!
    explainer = shap.TreeExplainer(bag[i]["bst"])

    shap_values = explainer.shap_values(data_ser)

    #display(explainer.expected_value,shap_values)    #### Uncomment to see what the shap_values are about!! It's an array with all the variables' shap values!

    display(shap.force_plot(explainer.expected_value[1], shap_values[1], data_ser, feature_names = feature_cols))
Model number : 1
prediction of model :  [0.52325245]  ground truth :  0
Usage of np.ndarray subset (sliced data) is not recommended due to it will double the peak memory cost in LightGBM.
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Model number : 2
prediction of model :  [0.55312608]  ground truth :  0
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Model number : 3
prediction of model :  [0.589512]  ground truth :  0
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Model number : 4
prediction of model :  [0.00367785]  ground truth :  0
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

Fancy plots, aren't they!!?? Yes... but they mean???

First of all: the only model that predicted correctly was the 4th, right? As the prediction score was 0.003 (output as class "0") and the label was 0!

  1. The colors indicate that: blue bars increase the model to predict as a negative case (output 0) and red bars increase the scores for predicting the sample as a positive case (output "1"). This is way we have a much bigger blue part in the 4th model than on the others (as the others predicted as "1")
  2. We can see that for the 3 first models, the "WRank" was an important feature contributting for the prediction scores. This is represented by the SIZE of the bar: bigger bar, bigger the SHAP value, thus, bigger its influence on the model prediction
  3. We can also observe that, despite missing the prediction, the 3 first models were quite "unsure" about the data as the prediction score was always surrounding our threshold of decision (0.5: score values < 0.5 are predicted as label "0" and score values > 0.5 as label "1"). This is not quite surprinsing, if we remember that those models were unlike to understand to whom the variables belonged to, as we didn't swap them accordingly to the player's order!

Another cool feature of SHAP is the "dependence_plo", where we can visualize the mean contribution of each model feature!! We call this plot with our function most_import

In [7]:
def most_import(shap_values,feature_cols,data):
    feature_cols = feature_cols
    display(shap.summary_plot(shap_values, data,feature_names = feature_cols))
In [8]:
def get_shap(m,part):
    
    feature_cols = bag[m]["valid"].columns.drop('Labels') 
    explainer = shap.TreeExplainer(bag[m]["bst"])
    data = bag[m][part][feature_cols]
    shap_values = explainer.shap_values(data)
    
    return shap_values, feature_cols, data

def get_idx(n,m):
    return [idx2 for idx2,x in enumerate(bag[m]["valid"].columns.drop("Labels")) if x == feat_3[n]]
In [9]:
### Lets take the Model 2 to analyse how the features play their importance in the prediction.
model = 3

shap_values, feat_cols, data = get_shap(model,'valid')

most_import(shap_values,feat_cols,data)
None

So, the plot above tell us that for the 3rd model, the 3 most important features contributing for assigning a sample as "1" or "0" are "Oponent_cat", "Player1_cat" and "WRank" (not surpring that Oponent-cat and Player1-cat have almost the same importance as they represent the categorical values of the players' names)

Overral analyse

Another interest tool of SHAP is to look at a group of predictions!! For that purpose, we create the function overall

In [10]:
#### OVERALL OUTPUT:
def overall(m1,m2):
    """
    m1 : number of model 1
    m2 : number of model 2
    """
    for i in range(m1,m2+1):
        feature_cols = bag[i]["valid"].columns.drop('Labels')
        explainer = shap.TreeExplainer(bag[i]["bst"])
        data = bag[i]['valid'][feature_cols]
        shap_values = explainer.shap_values(data)
    
        display(shap.force_plot(explainer.expected_value[1], shap_values[1][:1000,:], feature_cols)) #### we're taking 1000 samples from our valid set!
        

        ##  for name in valid.columns.drop("Labels"): ################# scatter plots between the model features!
        #     shap.dependence_plot(name, shap_values[1], valid[feature_cols])
In [11]:
overall(1,3) ### We're just visualizing the first 3 models!
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

OBS : te xaxis is ordered by SIMILARITY!! So don't confuse with the sample order in our valid set! To change that, change the up-tab for each graph and select original sample order (it will be less pretty but more consistency with the axis number ordering 0-1000)

In [ ]:
 
In [12]:
def roundIt(dic,n,r):
    for i in range(1,n):
        dic[i]['tp'] = round(float(dic[i]['tp']),r)
        dic[i]['tn'] = round(float(dic[i]['tn']),r)
        dic[i]['fp'] = round(float(dic[i]['fp']),r)
        dic[i]['fn'] = round(float(dic[i]['fn']),r)
    return dic

def get_conf_dict(n):
    y = [round(bag_ft[n]['tn'],3),bag_ft[n]['fn'],bag_ft[n]['tp'],bag_ft[n]['fp']]
    return y
In [13]:
i = 0
th = 0.5
feat_3 = ["Oponent_cat","Player1_cat", "WRank"] ### Most impactful features!
n_feat = len(feat_3)


m = 2 ### model number

shap_values, feature_cols, _ = get_shap(m,'valid')

bag_ft = {1:{"tp":0,"tn":0,"fp":0,"fn":0,"idx":get_idx(0,m),"feature":feat_3[0]},\
          2:{"tp":0,"tn":0,"fp":0,"fn":0,"idx":get_idx(1,m),"feature":feat_3[1]},\
          3:{"tp":0,"tn":0,"fp":0,"fn":0,"idx":get_idx(2,m),"feature":feat_3[2]}}

#print(bag_ft[1]["idx"])

y_true = bag[m]["valid"]["Labels"][:1000]
y_pred = bag[m]['bst'].predict(bag[m]["valid"][feature_cols])

for each in shap_values[1]:
    #print(each)
    
    while i < 1000:
        val1 = round(float(each[bag_ft[1]["idx"]]),3)
        val2 = round(float(each[bag_ft[2]["idx"]]),3)
        val3 = round(float(each[bag_ft[3]["idx"]]),3)
        idx = y_true.index[i]
        #print("ytrue, ypred, i : ",y_true[idx],y_pred3[i],i)
        if y_true[idx] == 1 and y_pred[i] >= th: #### TP
            #tpO = tpO + abs(val)
            bag_ft[1]["tp"] = bag_ft[1]["tp"] + abs(val1)
            bag_ft[2]["tp"] = bag_ft[2]["tp"] + abs(val2)
            bag_ft[3]["tp"] = bag_ft[3]["tp"] + abs(val3)
            i = i+ 1
            continue
        if y_true[idx] == 1 and y_pred[i] < th: #### FN
            #fnO = fnO + abs(val)
            bag_ft[1]["fn"] = bag_ft[1]["fn"] + abs(val1)
            bag_ft[2]["fn"] = bag_ft[2]["fn"] + abs(val2)
            bag_ft[3]["fn"] = bag_ft[3]["fn"] + abs(val3)
            i = i+ 1
            continue
        if y_true[idx] == 0 and y_pred[i] >= th: #### FP
            #fpO = fpO + abs(val)
            bag_ft[1]["fp"] = bag_ft[1]["fp"] + abs(val1)
            bag_ft[2]["fp"] = bag_ft[2]["fp"] + abs(val2)
            bag_ft[3]["fp"] = bag_ft[3]["fp"] + abs(val3)
            i = i+ 1
            continue
        if y_true[idx] == 0 and y_pred[i] < th: #### TN
            #tnO = tnO + abs(val)
            bag_ft[1]["tn"] = bag_ft[1]["tn"] + abs(val1)
            bag_ft[2]["tn"] = bag_ft[2]["tn"] + abs(val2)
            bag_ft[3]["tn"] = bag_ft[3]["tn"] + abs(val3)
            i = i+ 1

bag_ft = roundIt(bag_ft,4,None)  
print(bag_ft)
{1: {'tp': 37, 'tn': 37, 'fp': 19, 'fn': 17, 'idx': [51], 'feature': 'Oponent_cat'}, 2: {'tp': 4, 'tn': 4, 'fp': 2, 'fn': 2, 'idx': [50], 'feature': 'Player1_cat'}, 3: {'tp': 19, 'tn': 19, 'fp': 10, 'fn': 9, 'idx': [1], 'feature': 'WRank'}}
In [14]:
# data = [get_conf_dict(1), get_conf_dict(2), get_conf_dict(3)] 
  
# # Create the pandas DataFrame 
# df = pd.DataFrame(data, columns = ['TN', 'FN','TP','FP']) 
  
# # print dataframe. 
# df
In [15]:
# def get_bar_posneg():
#     fig = plt.figure(figsize=(20,10))
#     ind = np.arange(1,5)
    
In [16]:
#-5.420885622420833 -5.184509795861786 -2.8049931418340392 -2.347999877153214
fig = plt.figure(figsize=(20,10))
ind = np.arange(1,5)
cat = 3
y = get_conf_dict(cat)
print(y,ind)

plt.bar(ind, y)
plt.ylabel('Scores SHAP values',fontsize=20)
plt.title(f'Impact of {bag_ft[cat]["feature"]} on samples',fontsize=20)
plt.xticks(ind, ('TN', 'FN', 'TP', 'FP'),fontsize=20)
plt.yticks(np.arange(0.0, 22.0, 1.0),fontsize=20)
#plt.legend((p1[0], p2[0]), (f'Model {m1}', f'Model {m2}'),fontsize=20)
for i, v in enumerate(y):
    print(i,v)
    v = round(v,3)
    plt.text(i+0.95, v/2, str(v), color='black', fontweight='bold',fontsize=20)
plt.show()
[19, 9, 19, 10] [1 2 3 4]
0 19
1 9
2 19
3 10
In [17]:
##### Get the stacked bar plot of most important features impact on pos/neg cases
#################################################################################

fig = plt.figure(figsize=(20,10))
ind = np.arange(4)

y1 = get_conf_dict(2)
y2 = get_conf_dict(1)
y3 = get_conf_dict(3)
width = 0.45

p1 = plt.bar(ind, y1,width)           ### Player1
p2 = plt.bar(ind, y2,width,bottom=y1) ### Oponent_cat
p3 = plt.bar(ind, y3,width,bottom=y2) ### WRank


plt.xticks(ind, ('TN', 'FN', 'TP', 'FP'),fontsize=20)
plt.yticks(np.arange(0, 150, 10),fontsize=20)
plt.legend((p1[0], p2[0],p3[0]), (f'{bag_ft[2]["feature"]}',\
                                  f'{bag_ft[1]["feature"]}',\
                                  f'{bag_ft[3]["feature"]}'),fontsize=20)

plt.ylabel('Scores SHAP values',fontsize=20)
plt.title(f'SHAP Impact of most important features on pos/neg cases on model {m}',fontsize=20)


for i, v in enumerate(y1):
    #print(i,v)
    plt.text(i - 0.02, v/2, str(v), color='black', fontweight='bold',fontsize=20)
    
for i, v in enumerate(y2):
    #print(i,v)
    plt.text(i - 0.04, v/2 + y1[i], str(v), color='pink', fontweight='bold',fontsize=20)
for i, v in enumerate(y3):
    #print(i,v)
    plt.text(i - 0.04, v/2 + y2[i], str(v), color='white', fontweight='bold',fontsize=20)


plt.show()
In [18]:
def conf_mat(n,a,b):
    
    feature_cols = bag[n]["valid"].columns.drop('Labels')
    y_pred = bag[n]['bst'].predict(bag[n]["valid"][feature_cols][a:b])
    
    th = 0.5
    y_pred_class = y_pred > th

    cm = confusion_matrix(bag[n]["valid"]["Labels"][a:b], y_pred_class)
    tn, fp, fn, tp = cm.ravel()
    #display(tn,fn,tp, fp)
    
    return [tn, fp, fn, fp], y_pred
In [19]:
##### Get the differences in pos/neg between 2 MODELS and in a particular period analysed on the "overall" function
N = 4

##### Model 1 & 2
##### Period: 190:250
a = 160
b = 190
m1= 1
m2= 2
fig = plt.figure(figsize=(35,10))
conf1, y_pred1 = conf_mat(m1,a,b)
conf2, y_pred2 = conf_mat(m2,a,b)

# menMeans = (20, 35, 30, 35, 27)
# womenMeans = (25, 32, 34, 20, 25)
model1 = conf1
model2 = conf2

menStd = (2, 3, 4, 1)
womenStd = (3, 5, 2, 3 )
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, model1, width,)
p2 = plt.bar(ind, model2, width,
             bottom=model1)
for i, v in enumerate(model1):
    #print(i,v)
    plt.text(i - 0.025, v/2, str(v), color='black', fontweight='bold',fontsize=20)
    
for i, v in enumerate(model2):
    #print(i,v)
    plt.text(i - 0.025, v/2 + model1[i], str(v), color='gray', fontweight='bold',fontsize=20)
    
plt.ylabel('Scores',fontsize=20)
plt.title(f'Scores models {m1} & {m2} for samples in {a}:{b}',fontsize=20)
plt.xticks(ind, ('TN', 'FN', 'TP', 'FP'),fontsize=20)
plt.yticks(np.arange(0, 51, 5),fontsize=20)
plt.legend((p1[0], p2[0]), (f'Model {m1}', f'Model {m2}'),fontsize=20)


plt.show()

We can notice that in that region we have exactly the same amount of positive and negative cases (30 per each). That is to say: 30 times the "1" and 30 times the "0"

In [20]:
# #We can notice that in that region we have exactly the same amount of
# #positive and negative cases (30 per each). That is to say: 30 times the "1" and 30 times the "0"
# y_true = bag[2]["valid"]["Labels"][:1000]
# y_true[190:250].sum()